To reproduce this analysis on CellGenes JupyterHub first copy paste the following commands into the terminal to get data from our /nfs storage on the farm:
cd mkdir data/snQCandAnalysis/
mkdir data/snQCandAnalysis/OCT1_10x/
mkdir data/snQCandAnalysis/FFT4G_10x/
mkdir data/snQCandAnalysis/FFT4G_NEB/
Then replace aa16 in the following commands with your own sangerID and type in your farm3 password, when asked to do so, to access the sequencing data:
rsync -avzhr aa16@farm3-login:/nfs/team283/sequencing/snSeq/cellranger302_count_29507_5705STDY7945423_mm10-3_0_0_premrna/filtered_feature_bc_matrix data/snQCandAnalysis/FFT4G_10x/
gunzip data/snQCandAnalysis/FFT4G_10x/filtered_feature_bc_matrix/features.tsv.gz
rsync -avzhr aa16@farm3-login:/nfs/team283/sequencing/snSeq/cellranger302_count_29507_5705STDY7945424_mm10-3_0_0_premrna/filtered_feature_bc_matrix data/snQCandAnalysis/OCT1_10x/
gunzip data/snQCandAnalysis/OCT1_10x/filtered_feature_bc_matrix/features.tsv.gz
rsync -avzhr aa16@farm3-login:/nfs/team283/sequencing/snSeq/tic-281/combined_premrna/premrna-study5705-tic281-star-genecounts.txt data/snQCandAnalysis/FFT4G_NEB/
Now we load we load the three datasets into R, as well as the required R packages:
require(Matrix)
library(Seurat)
require(myUtils)
require(ggplot2)
require(Hmisc)
library(SoupX)
require(biomaRt)
# Neb data:
data_NEB = read.delim('/home/jovyan/data/snQCandAnalysis/FFT4G_NEB/premrna-study5705-tic281-star-genecounts.txt', header = TRUE, row.names = 1)
# 10x data:
data_10xF = readMM('/home/jovyan/data/snQCandAnalysis/FFT4G_10x/filtered_feature_bc_matrix/matrix.mtx.gz')
rowdata_10xF = read.delim('/home/jovyan/data/snQCandAnalysis/FFT4G_10x/filtered_feature_bc_matrix/features.tsv', header = FALSE)
coldata_10xF = read.delim('/home/jovyan/data/snQCandAnalysis/FFT4G_10x/filtered_feature_bc_matrix/barcodes.tsv.gz', header = FALSE)
data_10xF = as.matrix(data_10xF)
rownames(data_10xF) = rowdata_10xF[,2]
colnames(data_10xF) = coldata_10xF[,1]
data_10xO = readMM('/home/jovyan/data/snQCandAnalysis/OCT1_10x/filtered_feature_bc_matrix/matrix.mtx.gz')
rowdata_10xO = read.delim('/home/jovyan/data/snQCandAnalysis/OCT1_10x/filtered_feature_bc_matrix/features.tsv', header = FALSE)
coldata_10xO = read.delim('/home/jovyan/data/snQCandAnalysis/OCT1_10x/filtered_feature_bc_matrix/barcodes.tsv.gz', header = FALSE)
data_10xO = as.matrix(data_10xO)
rownames(data_10xO) = rowdata_10xO[,2]
colnames(data_10xO) = coldata_10xO[,1]
colnames(data_10xO) = paste(colnames(data_10xO),'2', sep = '')
The first QC step I made is to look at the fraction of counts/reads from mitochondrial genes and potentially remove any outliers:
# NEB
# Mitochondrial Genes:
# ensembl = useMart("ensembl")
# ensembl = useDataset("mmusculus_gene_ensembl",mart=ensembl)
# mtGenes = getBM(attributes = 'mgi_symbol', filter = 'chromosome_name', values = "MT", mart = ensembl)
# mtGenes = as.character(unlist(mtGenes))[as.character(unlist(mtGenes)) %in% rownames(data_10xF)]
mtGenes_symbol = c("mt-Nd1", "mt-Nd2", "mt-Co1", "mt-Co2", "mt-Atp8", "mt-Atp6", "mt-Co3",
"mt-Nd3", "mt-Nd4l", "mt-Nd4", "mt-Nd5", "mt-Nd6", "mt-Cytb")
mtGenes_eg = c('ENSMUSG00000064341', 'ENSMUSG00000064345', 'ENSMUSG00000064351', 'ENSMUSG00000064354', 'ENSMUSG00000064356',
'ENSMUSG00000064357', 'ENSMUSG00000064358', 'ENSMUSG00000064360', 'ENSMUSG00000065947', 'ENSMUSG00000064363',
'ENSMUSG00000064367', 'ENSMUSG00000064368', 'ENSMUSG00000064370')
# mtGenes_et = c('ENSMUST00000082392.1', 'ENSMUST00000082396.1', 'ENSMUST00000082402.1', 'ENSMUST00000082405.1', 'ENSMUST00000082407.1',
# 'ENSMUST00000082408.1', 'ENSMUST00000082409.1', 'ENSMUST00000082411.1', 'ENSMUST00000084013.1', 'ENSMUST00000082414.1',
# 'ENSMUST00000082418.1', 'ENSMUST00000082419.1', 'ENSMUST00000082421.1')
mtProp0 = colSums(data_NEB[mtGenes_eg,])/colSums(data_NEB)
mtProp1 = colSums(data_10xF[mtGenes_symbol,])/colSums(data_10xF)
mtProp2 = colSums(data_10xO[mtGenes_symbol,])/colSums(data_10xO)
mtDataFrame = data.frame(mtProp = c(mtProp0, mtProp1, mtProp2),
Run = c(rep('FFT4G_NEB', length(mtProp0)), rep('FFT4G_10x', length(mtProp1)), rep('OCT1_10x', length(mtProp2))))
p0 <- ggplot(mtDataFrame, aes(x=Run, y=mtProp, fill = Run)) +
scale_y_log10() +
geom_violin() +
geom_jitter(shape=16, size = 0.45, width = 0.45, height = 0.05) +
stat_summary(fun.y=mean, geom="point", size=2, color="red") +
ylab('Mitochondrial RNA Counts Proportion') +
theme(axis.text=element_text(size=14),
axis.title=element_text(size=14,face="bold"),
legend.text=element_text(size=14)) +
annotate("text", x = 0.7, y = 0.00001, label = "n(y = 0) = 3662") +
annotate("text", x = 1.85, y = 0.00001, label = "n(y = 0) = 160") +
annotate("text", x = 2.85, y = 0.00001, label = "n(y = 0) = 1284")
p0
pdf(file = 'figures/mitochondrialPercentage.pdf', width = 10, height = 10)
p0
dev.off()
## png
## 2
For now I did not remove any cells with high mitochondiral RNA count from the analysis, but in the future we can also choose to be more stringent.
Another QC metric is the expression of TNF-alpha (as an apoptosis marker), which I checked for next and turned out to be 0 in all cells:
tnfProp0 = data_NEB['ENSMUSG00000024401',]
tnfProp1 = data_10xF['Tnf',]
tnfProp2 = data_10xO['Tnf',]
print(sum(tnfProp0))
## [1] 0
print(sum(tnfProp1))
## [1] 0
print(sum(tnfProp2))
## [1] 0
Next I considered the number of detected Genes and total counts/UMI counts per cells (with UMI counts I refer to unique transcript counts in the 10x technology, while counts from NEB can correspond to the same transcript, so both are not directly comparable, but still give a rough overview of the data quality…).
genesDetected1 = colSums(data_10xF > 0)
genesDetected2 = colSums(data_10xO > 0)
genesDetected3 = colSums(data_NEB > 0)
readsDetected1 = colSums(data_10xF)
readsDetected2 = colSums(data_10xO)
readsDetected3 = colSums(data_NEB)
numberOfCells1 = length(genesDetected1)
numberOfCells2 = length(genesDetected2)
numberOfCells3 = length(genesDetected3)
qcDataFrame = data.frame(genesDetected = c(genesDetected1, genesDetected2, genesDetected3),
readsDetected = c(readsDetected1, readsDetected2, readsDetected3),
Run = c(rep('FFT4G_10x', length(genesDetected1)), rep('OCT1_10x', length(genesDetected2)), rep('FFT4G_NEB', length(mtProp0))))
p1 <- ggplot(qcDataFrame, aes(x=Run, y=genesDetected, fill = Run)) +
geom_violin() +
geom_jitter(shape=16, size = 0.25, width = 0.45, height = 0.45) +
stat_summary(fun.data=mean_sdl, mult=1, geom="pointrange", color="red") +
theme(axis.text=element_text(size=14),
axis.title=element_text(size=14,face="bold"),
legend.text=element_text(size=14)) +
annotate("text", x = 0.7, y = 5000, label = "n = 4865") +
annotate("text", x = 1.85, y = 2000, label = "n = 384") +
annotate("text", x = 2.85, y = 5000, label = "n = 5814") + ylab('Genes Detected')
p1
pdf(file = 'figures/NumberOfDetectedGenes.pdf', width = 10, height = 10)
p1
dev.off()
## png
## 2
p2 <- ggplot(qcDataFrame, aes(x=Run, y=readsDetected, fill = Run)) +
scale_y_log10() +
geom_violin() +
geom_jitter(shape=16, size = 0.25, width = 0.45, height = 0.45) +
stat_summary(fun.data=mean_sdl, mult=1, geom="pointrange", color="red") +
theme(axis.text=element_text(size=14),
axis.title=element_text(size=14,face="bold"),
legend.text=element_text(size=14)) +
annotate("text", x = 0.7, y = 30000, label = "n = 4865") +
annotate("text", x = 1.85, y = 30000, label = "n = 384") +
annotate("text", x = 2.85, y = 30000, label = "n = 5814") + ylab('Counts/UMI Counts Detected')
p2
pdf(file = 'figures/NumberOfReadsOrCounts.pdf', width = 10, height = 10)
p2
dev.off()
## png
## 2
Now it is obvious from both plots that there are still a few low quality cells in the NEB dataset (the 10x software cellranger already removed low count cells in the 10x datasets beforehand). So here I note for later analysis that I will only analyze cells with more than 2000 detected genes:
keepNEBafterQC = (colSums(data_NEB != 0) > 2000)
For doublet removal I ran the scrublet algorithm (see https://www.cell.com/cell-systems/pdfExtended/S2405-4712(18)30474-5) in python, which can be replicated by running the three scripts in this repository called: removeDoublets_FFT4G10x.py, removeDoublets_OCT110x.py and removeDoublets_FFT4GNEB.py . In a first step the scrublet algorithm simulates doublets by randomly mixing pairs of cells. Then it assigns both simulated and real cells a position in low dimensional space. Cells then get a doublet score, by counting the number of nearest neighbours that are simulated doublets. In this way a cells gets a doublet score of 0, when no neighbours are doublets and a doublet score of 1, when all neighbours are doublets. The python scripts mentioned above, save all the output from the algorithm in the /scrublet directory, as well as these plots of doublet scores for the real cell types and the simulated ones, which we can look at now:
knitr::include_graphics("scrublet/FFT4G_10x_doubletScore_histogram.pdf")
knitr::include_graphics("scrublet/OCT1_10x_doubletScore_histogram.pdf")
knitr::include_graphics("scrublet/FFT4G_NEB_doubletScore_histogram.pdf")
Many of the simulated doublets actually have quite a low doublet score, this is simply because a mix of the same cell type, called monotypic, is harder to identify as a doublet than a mix of two different cell types called neotypic. The scrublet algorithm aims to find neotypic doublets that should show up as a second peak on the simulate doublets histogram. It then automatically sets a threshold between the two peaks to remove likely doublets. This threshold can be decreased to remove more doublets at the expense of removing good quality samples too. For now I leave the conservative threshold as it is. For the FFT4G_NEB run I think the algorithm was too strict and I will only remove cells with a doublet score above 0.4:
doublets_10xF = read.table('scrublet/FFT4G_10x_predicted_doublets.txt')
doublets_10xO = read.table('scrublet/OCT1_10x_predicted_doublets.txt')
doubletsScore_NEB = read.table('scrublet/FFT4G_NEB_doublets_scores.txt')
keep10xF = (doublets_10xF == 0)
keep10xO = (doublets_10xO == 0)
keepNEB = (doubletsScore_NEB < 0.4 & keepNEBafterQC == 1)
data_10xF_QC = data_10xF[, keep10xF]
data_10xO_QC = data_10xO[, keep10xO]
data_NEB_QC = data_NEB[, keepNEB]
Maybe the best metric for the quality of the snSeq run is the extent to which it can identify biological variability. For this purpose, I embeded each dataset in 2 dimensions using the UMAP algorithm and visualized the expression of important cell type specific marker genes. As a comparison, I used the Allen smartSeq4 data (see https://www.ncbi.nlm.nih.gov/pubmed/30382198), which is a high quality reference dataset. I also used the marker genes they suggested in their publication. In this chunck of code I load the Allen data and reduce all 4 datasets to the sets of genes that are shared between them.
options(stringsAsFactors = FALSE)
dataAllen = readRDS('/home/jovyan/data/Allen/mouse_VISp_2018-06-14_exon+intron_counts-matrix.rds')
rowdataA = read.delim('/home/jovyan/data/Allen/mouse_VISp_2018-06-14_genes-rows.csv', sep = ',')
coldataA = read.delim('/home/jovyan/data/Allen/mouse_VISp_2018-06-14_samples-columns.csv', sep = ',')
rownames(dataAllen) = rowdataA[,1]
celltypes = coldataA[,'class']
keep = celltypes %in% c('GABAergic', 'Endothelial', 'Glutamatergic', 'Non-Neuronal')
coldataA = coldataA[keep,]
dataAllen = dataAllen[,keep]
celltypes = celltypes[keep]
subtypes = unlist(lapply(1:dim(coldataA)[1], function(x) paste(coldataA[x,c('class', 'subclass')], sep = '_', collapse = '_')))
subtypes = gsub(" ", "_", subtypes, fixed=TRUE)
subtypes = gsub("-", "", subtypes, fixed=TRUE)
dataAllen = dataAllen[,2:dim(dataAllen)[2]]
rownames(data_NEB_QC) = mapIdsMouse(rownames(data_NEB_QC), IDFrom = 'ENSEMBL', IDTo = 'SYMBOL')
commonGenes = intersect(intersect(rownames(data_NEB_QC), rownames(data_10xF_QC)), intersect(rownames(data_10xO_QC), rownames(dataAllen)))
dataAllen = dataAllen[commonGenes,]
data_10xF_QC = data_10xF_QC[commonGenes,]
data_10xO_QC = data_10xO_QC[commonGenes,]
data_NEB_QC = data_NEB_QC[commonGenes,]
Now we plot all four datasets side by side, visualizing different marker genes (4 for glutamatergic, 4 for GABAergic neurons) and in the last plot also the number of detected genes.
visualCortexData = cbind(dataAllen,data_10xF_QC,data_10xO_QC,data_NEB_QC)
metaData = cbind(c(rep('Allen', dim(dataAllen)[2]), rep('10x_FFT', dim(data_10xF)[2]), rep('10x_OCT', dim(data_10xO)[2]),rep('NEB_FFT', dim(data_NEB)[2])),
c(coldataA[,'class'], rep('Unknown', dim(data_10xF)[2]), rep('Unknown', dim(data_10xO)[2]), rep('Unknown', dim(data_NEB)[2])),
c(subtypes, rep('Unknown', dim(data_10xF)[2]), rep('Unknown', dim(data_10xO)[2]), rep('Unknown', dim(data_NEB)[2])))
colnames(metaData) = c('tech', 'celltype', 'subtype')
rownames(metaData) = c(colnames(dataAllen), colnames(data_10xF), colnames(data_10xO), colnames(data_NEB))
metaData = as.data.frame(metaData)
visualCortex <- CreateSeuratObject(visualCortexData, meta.data = metaData)
visualCortex.list <- SplitObject(visualCortex, split.by = 'tech')
for (i in 1:length(visualCortex.list)) {
visualCortex.list[[i]] <- NormalizeData(visualCortex.list[[i]], verbose = FALSE)
visualCortex.list[[i]] <- FindVariableFeatures(visualCortex.list[[i]], selection.method = "vst", nfeatures = 2000,
verbose = FALSE)
}
for (i in 1:length(visualCortex.list)) {
visualCortex.list[[i]] <- ScaleData(visualCortex.list[[i]], verbose = FALSE)
visualCortex.list[[i]] <- RunPCA(visualCortex.list[[i]], npcs = 30, verbose = FALSE)
visualCortex.list[[i]] <- RunUMAP(visualCortex.list[[i]], reduction = "pca", dims = 1:30)
}
visualCortex.list[[2]]$Log10mitoFraction = log(mtProp1,10)
visualCortex.list[[3]]$Log10mitoFraction = log(mtProp2,10)
visualCortex.list[[4]]$Log10mitoFraction = log(mtProp0,10)
featureList = c('Slc30a3', 'Cux2', 'Rorb', 'Scnn1a',
'Sst', 'Lamp5', 'Ndnf', 'Vip', 'nFeature_RNA')
for (j in 1:length(featureList)){
featurePlots = list()
for (i in 1:length(visualCortex.list)) {
featurePlots[[i]] = FeaturePlot(visualCortex.list[[i]], feature = featureList[[j]], reduction = "umap", cols = c('yellow', 'red'))
}
grDevices::pdf(NULL)
p = cowplot::plot_grid(featurePlots[[1]], featurePlots[[2]], featurePlots[[3]], featurePlots[[4]], labels = c('Allen','10x_FFT','10x_OCT', 'NEB_FFT'))
grDevices::dev.off()
cowplot::plot_grid(featurePlots[[1]], featurePlots[[2]], featurePlots[[3]], featurePlots[[4]], labels = c('Allen','10x_FFT','10x_OCT', 'NEB_FFT'))
pdf(file = paste('figures/UMAP_', featureList[[j]], '.pdf', sep = ','), width = 10, height = 10)
print(p)
dev.off()
print(p)
}
Overall it looks pretty good. Marker genes that are expressed highly in specific cluster in the Allen data, also fall into specific clusters in our snSeq data. However, it is obvious from the last 4 marker genes that our proportion of GABAergic neurons is smaller than theirs. The last plot with the number of detected genes, assures us that there is no obvious clustering based on this feature.
We can investigate differences in cell type proportions further by classifying each cell into its class, using the Allen data as a reference. I use the data integration methods in Seurate for this, which identifies ‘anchors’ of obviously similar cells across datasets and then uses these to map all remaining cells to their closest equivalent in the other data sets. Here is the general function that does this:
predictCellType = function(queryData, referenceData, labels = referenceData$celltype){
anchors <- FindTransferAnchors(reference = referenceData, query = queryData,
dims = 1:30)
predictions <- TransferData(anchorset = anchors, refdata = labels,
dims = 1:30)
predictions_top = predictions[,1]
predictions_score_all = predictions[,]
predictions_score = colSums(predictions[,2:(dim(predictions)[2]-1)])
predictions_score = predictions_score/sum(predictions_score)
names(predictions_score) = unlist(lapply(1:length(predictions_score), function(x) paste(strsplit(names(predictions_score)[x], split = '\\.')[[1]][-c(1,2)], sep = '', collapse = '')))
return(list(predictions_top, predictions_score, predictions_score_all))
}
I run this to predict both celltype and subtypes:
celltypePrediction = list()
queryList = c('10x_FFT', '10x_OCT', 'NEB_FFT')
print('Predicting celltypes ...')
## [1] "Predicting celltypes ..."
for (i in 1:length(queryList)){
print(queryList[[i]])
celltypePrediction[[i]] = predictCellType(visualCortex.list[[queryList[[i]]]], visualCortex.list[['Allen']])
}
## [1] "10x_FFT"
## [1] "10x_OCT"
## [1] "NEB_FFT"
names(celltypePrediction) = queryList
subtypePrediction = list()
queryList = c('10x_FFT', '10x_OCT', 'NEB_FFT')
print('Predicting subtypes ...')
## [1] "Predicting subtypes ..."
for (i in 1:length(queryList)){
print(queryList[[i]])
subtypePrediction[[i]] = predictCellType(visualCortex.list[[queryList[[i]]]], visualCortex.list[['Allen']], labels = visualCortex.list[['Allen']]$subtype)
}
## [1] "10x_FFT"
## [1] "10x_OCT"
## [1] "NEB_FFT"
names(subtypePrediction) = queryList
### Add the predicted cell type labels to the metadata:
visualCortex.list[[2]]$celltype = celltypePrediction[[1]][[1]]
visualCortex.list[[3]]$celltype = celltypePrediction[[2]][[1]]
visualCortex.list[[4]]$celltype = celltypePrediction[[3]][[1]]
visualCortex.list[[2]]$subtype = subtypePrediction[[1]][[1]]
visualCortex.list[[3]]$subtype = subtypePrediction[[2]][[1]]
visualCortex.list[[4]]$subtype = subtypePrediction[[3]][[1]]
visualCortex.list[[2]]$celltypeScore = celltypePrediction[[1]][[3]][,6]
visualCortex.list[[3]]$celltypeScore = celltypePrediction[[2]][[3]][,6]
visualCortex.list[[4]]$celltypeScore = celltypePrediction[[3]][[3]][,6]
visualCortex.list[[2]]$subtypeScore = subtypePrediction[[1]][[3]][,25]
visualCortex.list[[3]]$subtypeScore = subtypePrediction[[2]][[3]][,25]
visualCortex.list[[4]]$subtypeScore = subtypePrediction[[3]][[3]][,25]
And visualize the results in two bar plots, together with cell type proportions from the Allen data set:
celltypesAllen = table(coldataA['class'])
celltypesAllen = celltypesAllen/sum(celltypesAllen)
names(celltypesAllen)[4] = 'NonNeuronal' # Need to correct this one for consistency as the '-' in Non-Neuronal was ommited for the other datasets
celltypesAllen = celltypesAllen[names(celltypePrediction[[1]][[2]])]
subtypesAllen = table(subtypes)
subtypesAllen = subtypesAllen/sum(subtypesAllen)
names(subtypesAllen)[12] = "Glutamatergic_L23_IT"
subtypesAllen = subtypesAllen[names(subtypePrediction[[1]][[2]])]
celltypeProportions = data.frame(Celltype = names(celltypePrediction[[1]][[2]]), Proportion = c(celltypesAllen, celltypePrediction[[1]][[2]], celltypePrediction[[2]][[2]], celltypePrediction[[3]][[2]]), Run = c(rep('Allen', length(celltypesAllen)),
rep('FFT_10X', length(celltypePrediction[[1]][[2]])), rep('OCT_10X', length(celltypePrediction[[2]][[2]])), rep('FFT_SmartSeq', length(celltypePrediction[[3]][[2]]))))
subtypeProportions = data.frame(Celltype = names(subtypePrediction[[1]][[2]]), Proportion = c(subtypesAllen, subtypePrediction[[1]][[2]], subtypePrediction[[2]][[2]], subtypePrediction[[3]][[2]]), Run = c(rep('Allen', length(subtypesAllen)),
rep('FFT_10X', length(subtypePrediction[[1]][[2]])), rep('OCT_10X', length(subtypePrediction[[2]][[2]])), rep('FFT_SmartSeq', length(subtypePrediction[[3]][[2]]))))
library(tidyr)
library(ggplot2)
p2 = ggplot(data = celltypeProportions,
aes(x = Celltype, y = Proportion, fill = Run)) +
geom_bar(stat = 'identity', position = 'dodge')
p2
p3 = ggplot(data = subtypeProportions,
aes(x = Celltype, y = Proportion, fill = Run)) +
geom_bar(stat = 'identity', position = 'dodge') + theme(axis.text.x = element_text(angle = 90, hjust = 1))
p3
pdf(file = 'figures/Classification_CellClasses.pdf', width = 10, height = 5)
p2
dev.off()
## png
## 2
pdf(file = 'figures/Classification_CellSubClasses.pdf', width = 10, height = 5)
p3
dev.off()
## png
## 2
Now the Seurat method actually returns a score for each cell that is 1 if it is confidently mapped to a cell class and lower if it is less confident. Importantly, it can also be 0 across all cell classes, if no cell class in the reference data seems to match. Now in the code above I summed up the score for each cell class to get an idea of the proportions. It is good to check how these scores are distributed. Are most of them close to 1? Are some of them 0, indicating that we have cells that are not found in the reference data?
for (i in 1:length(celltypePrediction)){
df1 = data.frame(score = unname(celltypePrediction[[i]][[3]][6]), type = rep('general', length(celltypePrediction[[i]][[3]][6])))
h1 = ggplot(df1, aes(x=score)) +
geom_histogram(aes(y=..density..), colour="black", fill="white")+
geom_density(alpha=.2, fill="#FF6666") +ggtitle(names(celltypePrediction)[[i]])
df2 = data.frame(score = unname(subtypePrediction[[i]][[3]][25]), type = rep('general', length(subtypePrediction[[i]][[3]][6])))
h2 = ggplot(df2, aes(x=score)) +
geom_histogram(aes(y=..density..), colour="black", fill="white")+
geom_density(alpha=.2, fill="#FF6666") +ggtitle(names(celltypePrediction)[[i]])
p = cowplot::plot_grid(h1, h2, labels = c(' Celltypes', ' Subtypes'))
pdf(file = paste('figures/ClassificationScoreDistribution_', names(celltypePrediction)[[i]], '.pdf', sep = ','), width = 10, height = 20)
print(p)
dev.off()
print(p)
}
In general, given the prediction scores for the general cell types are all close to 1 there are likely no cell types that are not in the reference dataset. For the subtype prediction there are many scores of 1, but also significantly below 1, showing the difficulty of classifying cells into type with only fine distinctions. In their paper they have shown that a score > 0.5 can be trusted, so that is most of the datasets. The average subtype prediction score is an unfair comparison between the datasets, since the prediction uses information from similar cells within the dataset in the calculation and thus datasets with more cells benefit. Still for completenes I plot it below:
library(tidyr)
library(ggplot2)
classificationScores = data.frame(MeanScore = c(mean(subtypePrediction[[1]][[3]][[25]]), mean(subtypePrediction[[2]][[3]][[25]]), mean(subtypePrediction[[3]][[3]][[25]])),
Run = names(subtypePrediction))
p2 = ggplot(data = classificationScores,
aes(x = Run, y = MeanScore, fill = Run)) +
geom_bar(stat = 'identity', position = 'dodge')
p2
pdf(file = 'figures/ClassificationScoresMean.pdf', width = 10, height = 5)
p2
dev.off()
## png
## 2
Now for a final comparison between the different protocols and to see if there are any biases between them, such as a cell cluster absent in one protocol, I produce joint 2D embeddings for the data below. For this purpose I use again Seurat’s anchor based method, which has the advantage that it will successfully integrate different datasets even if certain cell populations only exist in one of the datasets. Of particular interest are pairwise comparisons between the two 10x runs and the two runs with fresh frozen tissue:
reference.list <- visualCortex.list[c("10x_FFT", "10x_OCT")]
visualCortex10x.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30)
visualCortex.10xintegrated <- IntegrateData(anchorset = visualCortex10x.anchors, dims = 1:30)
library(ggplot2)
library(cowplot)
# switch to integrated assay. The variable features of this assay are automatically set during
# IntegrateData
DefaultAssay(visualCortex.10xintegrated) <- "integrated"
# Run the standard workflow for visualization and clustering
visualCortex.10xintegrated <- ScaleData(visualCortex.10xintegrated, verbose = FALSE)
visualCortex.10xintegrated <- RunPCA(visualCortex.10xintegrated, npcs = 30, verbose = FALSE)
visualCortex.10xintegrated <- RunUMAP(visualCortex.10xintegrated, reduction = "pca", dims = 1:30)
# VisualCortex.snSeq = subset(visualCortex.integrated, cells = c(colnames(data_10xF), colnames(data_10xO)))
p4 <- DimPlot(visualCortex.10xintegrated, reduction = "umap", group.by = "tech")
p4
pdf(file = 'figures/10xJointEmbedding.pdf', width = 10, height = 5)
p4
dev.off()
## png
## 2
reference.list <- visualCortex.list[c("10x_FFT", "NEB_FFT")]
visualCortexFFT.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30)
visualCortex.FFTintegrated <- IntegrateData(anchorset = visualCortexFFT.anchors, dims = 1:30)
library(ggplot2)
library(cowplot)
# switch to integrated assay. The variable features of this assay are automatically set during
# IntegrateData
DefaultAssay(visualCortex.FFTintegrated) <- "integrated"
# Run the standard workflow for visualization and clustering
visualCortex.FFTintegrated <- ScaleData(visualCortex.FFTintegrated, verbose = FALSE)
visualCortex.FFTintegrated <- RunPCA(visualCortex.FFTintegrated, npcs = 30, verbose = FALSE)
visualCortex.FFTintegrated <- RunUMAP(visualCortex.FFTintegrated, reduction = "pca", dims = 1:30)
# VisualCortex.snSeq = subset(visualCortex.integrated, cells = c(colnames(data_10xF), colnames(data_10xO)))
p5 <- DimPlot(visualCortex.FFTintegrated, reduction = "umap", group.by = "tech")
p5
pdf(file = 'figures/FFTJointEmbedding.pdf', width = 10, height = 5)
p5
dev.off()
## png
## 2
Now there seems to be a cluster of cells that is only present in 10x_OCT and not 10x_FFT. To investigate these further I coloured the same plot based on number of detected genes, classyfied cell type, classification score and mitochondrial transcript count fraction:
p15 = DimPlot(visualCortex.10xintegrated, reduction = "umap", group.by = "subtype")
p16 = FeaturePlot(visualCortex.10xintegrated, feature = "nFeature_RNA", reduction = "umap", cols = c('yellow', 'red'))
p17 = FeaturePlot(visualCortex.10xintegrated, feature = "subtypeScore", reduction = "umap", cols = c('yellow', 'red'))
p18 = FeaturePlot(visualCortex.10xintegrated, feature = "Log10mitoFraction", reduction = "umap", cols = c('yellow', 'red'))
p15
p16
p17
p18
pdf(file = 'figures/UMAP_10x_subtypes.pdf', width = 10, height = 5)
p15
dev.off()
## png
## 2
pdf(file = 'figures/UMAP_10x_nCounts.pdf', width = 10, height = 5)
p16
dev.off()
## png
## 2
pdf(file = 'figures/UMAP_10x_subtypeScore.pdf', width = 10, height = 5)
p17
dev.off()
## png
## 2
pdf(file = 'figures/UMAP_10x_Log10mitoFraction.pdf', width = 10, height = 5)
p18
dev.off()
## png
## 2
The number of counts, classification scores and mitochondrial counts fraction are not strongly out of the expected range. So it may simply be that this is a further subcategory of glutamatergic_L5_PT neurons that is only present in the OCT samples. Isolating the cells in this cluster and looking at DE genes compared to the rest will be my next step.